# ------------------------------
# Merge NVDRS, ACS, RCMS, and county-level measures all together
# ------------------------------

# read the processed NVDRS file as a data.table and replace every '.' in
# its column names with '_'
nvdrs <- read_fst(file.path(processed_path, 'nvdrs_v1.fst'), as.data.table = TRUE)
setnames(nvdrs, gsub('\\.', '_', names(nvdrs)))

# build the record identifier uid as Year_Site_IncID_PerID, where Site is
# padded with '0' to width 2 and IncID / PerID are padded to width 4
nvdrs[, uid := {
  site_part <- str_pad(Site, width = 2, pad = '0')
  incident_part <- str_pad(IncID, width = 4, pad = '0')
  person_part <- str_pad(PerID, width = 4, pad = '0')
  paste0(Year, '_', site_part, '_', incident_part, '_', person_part)
}]

# load the county-matched ACS file and keep only rows that have a uid
acs_county <- read_fst(
  file.path(processed_path, 'acs_matched_county.fst'),
  as.data.table = TRUE
)[!is.na(uid)]

#---------------------------------------------------------------------------------
# create a lower-case `state` column in both sources:
#   ACS   - copy of State, unchanged
#   NVDRS - State padded with '0' to width 2
# NOTE(review): ACS State is copied without padding -- confirm it is already
# stored in the same 2-character form as the NVDRS value
acs_county[, state := State]
nvdrs[, state := str_pad(State, width = 2, pad = '0')]
#=================================================================================

#------------------------------------------------------------------------------
# RCMS data
rcms <- read_fst(file.path(processed_path, 'rcms_reltrad.fst'), as.data.table = TRUE)

# convert each GC_* column into a rate per 10,000 of TOTPOP.
# set() replaces the column by reference; the previous `rcms[[var]] = ...`
# form goes through `[[<-`, which copies the data.table on every iteration
rcms_var <- c('GC_ProE', 'GC_ProM', 'GC_Cath', 'GC_Oth', 'GC_Jew', 'GC_ProB')
for (var in rcms_var) {
  set(rcms, j = var, value = rcms[[var]] / rcms[['TOTPOP']] * 10000)
}

# rename by reference: every occurrence of 'GC_' in any column name becomes
# 'Rat_GC_' (this applies to all columns, not only those in rcms_var)
setnames(rcms, gsub('GC_', 'Rat_GC_', names(rcms)))

# other county level measures

# county land area; rows with county_fips equal to 0 are removed
land_area <- fread(
  file.path(rawdata_path, 'county', 'county_2010_landarea.csv')
)[county_fips != 0]

# county population, restricted to years 2005 through 2011 (inclusive)
pop_total <- fread(
  file.path(rawdata_path, 'county', 'county_2000_2019_population.csv')
)[year >= 2005 & year <= 2011]

# population density: every population row is kept (left join on county_fips)
# and `value` is divided by `land_area`
# NOTE(review): inside the brackets `land_area` is presumably a column coming
# from the land-area file (same name as the table) -- confirm
pop_den <- merge(pop_total, land_area, by = 'county_fips', all.x = TRUE)
pop_den[, Pop_Den := value / land_area]
#pop_den[,mean(Pop_Den,na.rm=TRUE)]

# county poverty file: rename its columns to the names used in this script,
# then keep years 2005 through 2011 (inclusive)
poverty <- fread(file.path(rawdata_path, 'county', 'county_2005_2017_poverty.csv'))
setnames(
  poverty,
  old = c('time', 'GEOID', 'SAEPOVRTALL_PT'),
  new = c('year', 'county_fips', 'Rat_Poverty')
)
poverty <- poverty[year >= 2005 & year <= 2011]

# county migration file: same year window; the out_migration_rate column is
# the one carried forward, under the name Rat_Mig_Cum
migration <- fread(
  file.path(rawdata_path, 'county', 'county_migration_in_out_2004_2017.csv')
)[year >= 2005 & year <= 2011]
setnames(migration, old = 'out_migration_rate', new = 'Rat_Mig_Cum')

# assemble the county-year macro table with full outer joins, so a
# county-year present in any one source is retained
county_macro <- merge(
  pop_den[, .(county_fips, year, Pop_Den)],
  poverty[, .(county_fips, year, Rat_Poverty)],
  by = c('county_fips', 'year'),
  all = TRUE
)
county_macro <- merge(
  county_macro,
  migration[, .(county_fips, year, Rat_Mig_Cum)],
  by = c('county_fips', 'year'),
  all = TRUE
)

# pad county_fips with '0' to width 5 before joining RCMS on it
# NOTE(review): RCMS is joined on county_fips only (no year) -- assumes rcms
# holds one row per county and stores county_fips in the same padded form
county_macro[, county_fips := str_pad(county_fips, width = 5, pad = '0')]
county_macro <- merge(county_macro, rcms, by = 'county_fips', all = TRUE)
#==============================================================================

#------------------------------------------------------------------------------
# stack the ACS and NVDRS records
# columns taken from both sources, plus the source-specific extras
keep_var <- c(
  'uid', 'DSID', 'AgeGrp4', 'Age', 'Sex', 'Race4', 'Hisp', 'BornUSA',
  'MarStat6', 'UnEmpl', 'PhysProb', 'state', 'Year'
)
keep_var_acs <- c('county_weight', 'HseWgt', 'county')
keep_var_nvdrs <- c('county')

# NVDRS county id: State padded to width 2 followed by Cnty padded to width 3
nvdrs[, county := paste0(
  str_pad(State, width = 2, pad = '0'),
  str_pad(Cnty, width = 3, pad = '0')
)]

# NVDRS keeps all rows; ACS keeps only rows whose county occurs in NVDRS
sel_nvdrs <- nvdrs[, .SD, .SDcols = c(keep_var, keep_var_nvdrs)]
sel_acs <- acs_county[
  county %in% unique(nvdrs[['county']]),
  .SD,
  .SDcols = c(keep_var, keep_var_acs)
]

# ACS rows first, then NVDRS; columns missing from one source are filled with NA
reg_data <- rbind(sel_acs, sel_nvdrs, fill = TRUE)

#==============================================================================
# attach the county-year macro measures to every stacked record
# (left join: all rows of reg_data are kept, matched on county + Year)
# NOTE(review): if county_macro ever holds more than one row per
# county_fips/year this join duplicates records -- worth verifying that
# nrow(reg_data_merged) equals nrow(reg_data)
reg_data_merged <- merge(
  reg_data,
  county_macro,
  by.x = c('county', 'Year'),
  by.y = c('county_fips', 'year'),
  all.x = TRUE
)

# rows with DSID == 1 get a county_weight of 1; all other rows keep the
# county_weight they already have
reg_data_merged[DSID == 1, county_weight := 1]

# to reduce the datafile size, store these columns as integer.
# set() replaces each column by reference; assigning with
# `reg_data_merged[[var]] = ...` goes through `[[<-`, which copies the
# data.table and can leave it with an invalid internal self-reference for
# the later `:=` calls
for (var in c('DSID', 'AgeGrp4', 'Age', 'Sex', 'Race4', 'Hisp', 'BornUSA', 'MarStat6', 'UnEmpl', 'PhysProb')) {
  set(reg_data_merged, j = var, value = as.integer(reg_data_merged[[var]]))
}

# drop counties when there's no comparable ACS counties
# -- 19 county, and 44 suicide cases
#weighted_mean = reg_data_merged[, weighted.mean(DSID, county_weight), by='county']
#all_death_county = weighted_mean[V1 == 1, county] 
#nvdrs[county %in% all_death_county, ]
#acs_county[county %in% all_death_county,]

# keep only counties whose mean DSID is below 1, i.e. drop counties in which
# every record has DSID == 1 (a county whose mean DSID is NA is dropped too)
reg_data_merged <- reg_data_merged[
  county %in% reg_data_merged[, .(p_suicide = mean(DSID)), by = 'county'][p_suicide < 1, county]
]

# check yearly rate : looks comparable
#reg_data_merged[,weighted.mean(DSID,county_weight)*10^5, by='Year']
#reg_data_merged[,round(weighted.mean(DSID,county_weight)*10^5,2), by='Race4']

# save the merged person-level file with the compression setting at 100
write_fst(
  reg_data_merged,
  file.path(processed_path, 'merged_county_v1.fst'),
  compress = 100
)
